# RUN: llc -run-pass=si-insert-waitcnts -march=amdgcn -mcpu=tahiti -o - %s | FileCheck %s -check-prefixes=CHECK,SI
# RUN: llc -run-pass=si-insert-waitcnts -march=amdgcn -mcpu=gfx900 -o - %s | FileCheck %s -check-prefixes=CHECK,GFX9
# RUN: llc -run-pass=si-insert-waitcnts -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -o - %s | FileCheck %s
# RUN: llc -run-pass=si-insert-waitcnts -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -o - %s | FileCheck %s
---
# Basic case: bb.0 issues an SMEM load (S_LOAD_DWORDX2_IMM), then defines $vcc
# with a V_CMP and branches on vccz. Under the SI prefix (the tahiti run) an
# S_WAITCNT 127 and a "$vcc = S_MOV_B64 $vcc" are expected between the compare
# and the branch; every other run expects the branch right after the compare.
# NOTE(review): the input branches to %bb.1 while the expected output names
# %bb.2 -- presumably blocks are renumbered in file order on output; confirm.
# CHECK-LABEL: name: vccz_corrupt_workaround
# CHECK: $vcc = V_CMP_EQ_F32
# SI-NEXT: S_WAITCNT 127
# SI-NEXT: $vcc = S_MOV_B64 $vcc
# CHECK-NEXT: S_CBRANCH_VCCZ %bb.2, implicit killed $vcc

name: vccz_corrupt_workaround
tracksRegLiveness: true
body: |
  bb.0:
    liveins: $sgpr0_sgpr1

    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 11, 0
    $sgpr7 = S_MOV_B32 61440
    $sgpr6 = S_MOV_B32 -1
    $vcc = V_CMP_EQ_F32_e64 0, 0, 0, undef $sgpr2, 0, implicit $mode, implicit $exec
    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc

  bb.2:
    liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003

    $vgpr0 = V_MOV_B32_e32 9, implicit $exec
    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec
    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
    S_BRANCH %bb.3

  bb.1:
    liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003

    $vgpr0 = V_MOV_B32_e32 100, implicit $exec
    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec
    $vgpr0 = V_MOV_B32_e32 1, implicit $exec

  bb.3:
    liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003

    $sgpr3 = S_MOV_B32 61440
    $sgpr2 = S_MOV_B32 -1
    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    S_ENDPGM 0

...
---
# Same block layout as vccz_corrupt_workaround, but bb.0 has no compare and the
# branch reads an undef $vcc. The checks only pin what follows the first
# BUFFER_STORE_DWORD_OFFSET in the output: an S_WAITCNT 3855 under the SI
# prefix, then the V_MOV_B32_e32 that redefines $vgpr0.
# CHECK-LABEL: name: vccz_corrupt_undef_vcc
# CHECK: BUFFER_STORE_DWORD_OFFSET
# SI-NEXT: S_WAITCNT 3855
# CHECK-NEXT: $vgpr0 = V_MOV_B32_e32

name: vccz_corrupt_undef_vcc
tracksRegLiveness: true
body: |
  bb.0:
    liveins: $sgpr0_sgpr1

    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 11, 0
    $sgpr7 = S_MOV_B32 61440
    $sgpr6 = S_MOV_B32 -1
    S_CBRANCH_VCCZ %bb.1, implicit undef $vcc

  bb.2:
    liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003

    $vgpr0 = V_MOV_B32_e32 9, implicit $exec
    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec
    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
    S_BRANCH %bb.3

  bb.1:
    liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003

    $vgpr0 = V_MOV_B32_e32 100, implicit $exec
    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec
    $vgpr0 = V_MOV_B32_e32 1, implicit $exec

  bb.3:
    liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003

    $sgpr3 = S_MOV_B32 61440
    $sgpr2 = S_MOV_B32 -1
    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    S_ENDPGM 0

...
---
# Test that after reloading vcc spilled to a vgpr, we insert any necessary
# instructions to fix vccz.
#
# $vcc is rebuilt half by half with two V_READLANE_B32 reads of $vgpr0 (lanes 8
# and 9). The SI and GFX9 prefixes expect a "$vcc = S_MOV_B64 $vcc" before the
# branch; runs using only the default prefix expect the branch to directly
# follow the $vcc_hi read.

# CHECK-LABEL: name: reload_vcc_from_vgpr
# CHECK: $vcc_lo = V_READLANE_B32 $vgpr0, 8, implicit-def $vcc
# CHECK: $vcc_hi = V_READLANE_B32 $vgpr0, 9
# SI:    $vcc = S_MOV_B64 $vcc
# GFX9:  $vcc = S_MOV_B64 $vcc
# CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc

name: reload_vcc_from_vgpr
body: |
  bb.0:
    $vcc_lo = V_READLANE_B32 $vgpr0, 8, implicit-def $vcc
    $vcc_hi = V_READLANE_B32 $vgpr0, 9
    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
  bb.1:

...
---
# Test that after reloading vcc spilled to memory, we insert any necessary
# instructions to fix vccz.
#
# Each half of $vcc is loaded into $vgpr0 with BUFFER_LOAD_DWORD_OFFSET (offsets
# 4 and 8) and moved over with V_READFIRSTLANE_B32. The SI and GFX9 prefixes
# expect a "$vcc = S_MOV_B64 $vcc" before the branch; runs using only the
# default prefix expect the branch to directly follow the $vcc_hi read.

# CHECK-LABEL: name: reload_vcc_from_mem
# CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec
# CHECK: $vcc_lo = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
# CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec
# CHECK: $vcc_hi = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
# SI:    $vcc = S_MOV_B64 $vcc
# GFX9:  $vcc = S_MOV_B64 $vcc
# CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc

name: reload_vcc_from_mem
body: |
  bb.0:
    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec
    $vcc_lo = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec
    $vcc_hi = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
  bb.1:

...
---
# Test that after inline asm that defines vcc_lo, we insert any necessary
# instructions to fix vccz.
#
# The INLINEASM only carries an implicit-def of $vcc_lo, while the branch reads
# the full $vcc. The SI and GFX9 prefixes expect a "$vcc = S_MOV_B64 $vcc"
# before the branch; runs using only the default prefix expect the branch to
# directly follow the INLINEASM.

# CHECK-LABEL: name: inlineasm_def_vcc_lo
# CHECK: INLINEASM &"; def vcc_lo", 1 /* sideeffect attdialect */, 10 /* regdef */, implicit-def $vcc_lo
# SI:    $vcc = S_MOV_B64 $vcc
# GFX9:  $vcc = S_MOV_B64 $vcc
# CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc

name: inlineasm_def_vcc_lo
body: |
  bb.0:
    INLINEASM &"; def vcc_lo", 1, 10, implicit-def $vcc_lo
    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
  bb.1:

...
---
# Test that after inline asm that defines vcc, no unnecessary instructions are
# inserted to fix vccz.
#
# Counterpart of inlineasm_def_vcc_lo: here the INLINEASM carries an
# implicit-def of the full $vcc, and every run expects the branch on the line
# directly after it.

# CHECK-LABEL: name: inlineasm_def_vcc
# CHECK: INLINEASM &"; def vcc", 1 /* sideeffect attdialect */, 10 /* regdef */, implicit-def $vcc
# CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc

name: inlineasm_def_vcc
body: |
  bb.0:
    INLINEASM &"; def vcc", 1, 10, implicit-def $vcc
    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
  bb.1:

...
---
# Test vcc definition in a previous basic block.
#
# $vcc is written by an S_MOV_B64 in bb.0 and read by the branch in bb.1. The
# SI and GFX9 prefixes expect a "$vcc = S_MOV_B64 $vcc" between the bb.1 label
# and the branch; runs using only the default prefix just expect the branch
# somewhere after the label.

# CHECK-LABEL: name: vcc_def_pred
# CHECK: bb.1:
# SI:  $vcc = S_MOV_B64 $vcc
# GFX9:  $vcc = S_MOV_B64 $vcc
# CHECK: S_CBRANCH_VCCZ %bb.2, implicit $vcc

name: vcc_def_pred
body: |
  bb.0:
    $vcc = S_MOV_B64 0
  bb.1:
    S_CBRANCH_VCCZ %bb.2, implicit $vcc
  bb.2:

...

# Test various ways that the live range of vccz can overlap with the live range
# of an outstanding smem load.

---
# Order in bb.0: load, wait, def, use. $vcc is written after the explicit
# S_WAITCNT 127. The SI checks expect the input sequence unchanged apart from an
# S_WAITCNT 0 placed directly ahead of the load, i.e. no
# "$vcc = S_MOV_B64 $vcc" before the branch.
# CHECK-LABEL: name: load_wait_def_use
# SI: S_WAITCNT 0
# SI-NEXT: $sgpr0 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
# SI-NEXT: S_WAITCNT 127
# SI-NEXT: $vcc = S_MOV_B64 0
# SI-NEXT: S_CBRANCH_VCCZ %bb.1, implicit $vcc
name: load_wait_def_use
body: |
  bb.0:
    $sgpr0 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
    S_WAITCNT 127
    $vcc = S_MOV_B64 0
    S_CBRANCH_VCCZ %bb.1, implicit $vcc
  bb.1:
...

---
# Order in bb.0: load, wait, nop, def, use. Same as load_wait_def_use with an
# S_NOP 0 between the wait and the $vcc write. The SI checks again expect only
# an S_WAITCNT 0 ahead of the load and no "$vcc = S_MOV_B64 $vcc" before the
# branch.
# CHECK-LABEL: name: load_wait_nop_def_use
# SI: S_WAITCNT 0
# SI-NEXT: $sgpr0 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
# SI-NEXT: S_WAITCNT 127
# SI-NEXT: S_NOP 0
# SI-NEXT: $vcc = S_MOV_B64 0
# SI-NEXT: S_CBRANCH_VCCZ %bb.1, implicit $vcc
name: load_wait_nop_def_use
body: |
  bb.0:
    $sgpr0 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
    S_WAITCNT 127
    S_NOP 0
    $vcc = S_MOV_B64 0
    S_CBRANCH_VCCZ %bb.1, implicit $vcc
  bb.1:
...

---
# Order in bb.0: load, def, wait, use. $vcc is written between the load and the
# explicit S_WAITCNT 127. The SI checks expect a "$vcc = S_MOV_B64 $vcc"
# inserted between that wait and the branch.
# CHECK-LABEL: name: load_def_wait_use
# SI: S_WAITCNT 0
# SI-NEXT: $sgpr0 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
# SI-NEXT: $vcc = S_MOV_B64 0
# SI-NEXT: S_WAITCNT 127
# SI-NEXT: $vcc = S_MOV_B64 $vcc
# SI-NEXT: S_CBRANCH_VCCZ %bb.1, implicit $vcc
name: load_def_wait_use
body: |
  bb.0:
    $sgpr0 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
    $vcc = S_MOV_B64 0
    S_WAITCNT 127
    S_CBRANCH_VCCZ %bb.1, implicit $vcc
  bb.1:
...

---
# Order in bb.0: load, def, wait, nop, use. Same as load_def_wait_use with an
# S_NOP 0 between the wait and the branch. The SI checks expect a
# "$vcc = S_MOV_B64 $vcc" inserted after the nop, directly before the branch.
# CHECK-LABEL: name: load_def_wait_nop_use
# SI: S_WAITCNT 0
# SI-NEXT: $sgpr0 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
# SI-NEXT: $vcc = S_MOV_B64 0
# SI-NEXT: S_WAITCNT 127
# SI-NEXT: S_NOP 0
# SI-NEXT: $vcc = S_MOV_B64 $vcc
# SI-NEXT: S_CBRANCH_VCCZ %bb.1, implicit $vcc
name: load_def_wait_nop_use
body: |
  bb.0:
    $sgpr0 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
    $vcc = S_MOV_B64 0
    S_WAITCNT 127
    S_NOP 0
    S_CBRANCH_VCCZ %bb.1, implicit $vcc
  bb.1:
...

---
# Order in bb.0: load, def, use, with no wait in the input. The SI checks
# expect both an S_WAITCNT 127 and a "$vcc = S_MOV_B64 $vcc" inserted between
# the $vcc write and the branch.
# CHECK-LABEL: name: load_def_use
# SI: S_WAITCNT 0
# SI-NEXT: $sgpr0 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
# SI-NEXT: $vcc = S_MOV_B64 0
# SI-NEXT: S_WAITCNT 127
# SI-NEXT: $vcc = S_MOV_B64 $vcc
# SI-NEXT: S_CBRANCH_VCCZ %bb.1, implicit $vcc
name: load_def_use
body: |
  bb.0:
    $sgpr0 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
    $vcc = S_MOV_B64 0
    S_CBRANCH_VCCZ %bb.1, implicit $vcc
  bb.1:
...

---
# Order in bb.0: def, load, wait, use. $vcc is written before the load is
# issued. The SI checks expect a "$vcc = S_MOV_B64 $vcc" inserted between the
# input's S_WAITCNT 127 and the branch.
# CHECK-LABEL: name: def_load_wait_use
# SI: S_WAITCNT 0
# SI-NEXT: $vcc = S_MOV_B64 0
# SI-NEXT: $sgpr0 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
# SI-NEXT: S_WAITCNT 127
# SI-NEXT: $vcc = S_MOV_B64 $vcc
# SI-NEXT: S_CBRANCH_VCCZ %bb.1, implicit $vcc
name: def_load_wait_use
body: |
  bb.0:
    $vcc = S_MOV_B64 0
    $sgpr0 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
    S_WAITCNT 127
    S_CBRANCH_VCCZ %bb.1, implicit $vcc
  bb.1:
...

---
# Order in bb.0: def, load, wait, nop, use. Same as def_load_wait_use with an
# S_NOP 0 between the wait and the branch. The SI checks expect a
# "$vcc = S_MOV_B64 $vcc" inserted after the nop, directly before the branch.
# CHECK-LABEL: name: def_load_wait_nop_use
# SI: S_WAITCNT 0
# SI-NEXT: $vcc = S_MOV_B64 0
# SI-NEXT: $sgpr0 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
# SI-NEXT: S_WAITCNT 127
# SI-NEXT: S_NOP 0
# SI-NEXT: $vcc = S_MOV_B64 $vcc
# SI-NEXT: S_CBRANCH_VCCZ %bb.1, implicit $vcc
name: def_load_wait_nop_use
body: |
  bb.0:
    $vcc = S_MOV_B64 0
    $sgpr0 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
    S_WAITCNT 127
    S_NOP 0
    S_CBRANCH_VCCZ %bb.1, implicit $vcc
  bb.1:
...

---
# Order in bb.0: def, load, use, with no wait in the input. The SI checks
# expect both an S_WAITCNT 127 and a "$vcc = S_MOV_B64 $vcc" inserted between
# the load and the branch.
# CHECK-LABEL: name: def_load_use
# SI: S_WAITCNT 0
# SI-NEXT: $vcc = S_MOV_B64 0
# SI-NEXT: $sgpr0 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
# SI-NEXT: S_WAITCNT 127
# SI-NEXT: $vcc = S_MOV_B64 $vcc
# SI-NEXT: S_CBRANCH_VCCZ %bb.1, implicit $vcc
name: def_load_use
body: |
  bb.0:
    $vcc = S_MOV_B64 0
    $sgpr0 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
    S_CBRANCH_VCCZ %bb.1, implicit $vcc
  bb.1:
...
